import pandas as pd
import jieba
import re

# Data source: Sogou Labs  http://www.sogou.com/labs/resource/list_news.php

# Read the whole news dump into memory as one string. The context manager
# closes the file handle as soon as the read completes, so the descriptor is
# not left open for the rest of the script.
with open('news_smart.dat', encoding='utf-8') as f:
    txtbuffer = f.read()

# Pattern text for locating the span between "content" tags. It is written as
# a raw string so the backslash is kept literally without relying on an
# invalid string escape; the runtime value is unchanged.
# NOTE(review): if this is passed to `re`, the sequence `\c` is rejected as a
# bad escape, and the closing tag is presumably meant to be `</content` --
# confirm against the data and the code that uses this pattern.
findword = r"content>.*<\content"

# List seeded with a single empty string; it is not filled in this section.
newlist = ['']

# The file text split into a list of single-character strings.
oldlist = list(txtbuffer)




